#include <stdlib.h>
#include <stdio.h>
#include <math.h>

#include <cuda_runtime.h>
#include <device_launch_parameters.h>

// Forward declarations of the host-side entry points. extern "C" gives them C
// linkage, so their names are not C++-mangled and other source files can link
// against them. It is good practice to declare your externals at the start of
// each CUDA (.cu) file, so you know what functions are stored where!
extern "C" void cudaTest(int* a, int* b, int aL);			// Defined further down in this file.
extern "C" void nBody(float4* a, float3* b, int nBodies);	// NOTE(review): no definition of nBody is visible in this part of the file - confirm where it lives.


// A __device__ function runs on the GPU and can only be called from other GPU
// code, e.g. from within a kernel (see addKernel below). Small helpers like
// this are a good way of keeping your GPU code debug-friendly.
//
// Stores the sum of the two ints pointed to by `a` and `b` in the int pointed
// to by `x`. All three arguments are plain pointers, so a kernel can hand this
// function the addresses of individual array elements.
__device__ void vectorAddition(int* x, int* a, int* b)
{
	const int lhs = *a;		// Read both operands first...
	const int rhs = *b;

	*x = lhs + rhs;			// ...then write the result through the output pointer.
}


// Element-wise integer addition kernel: x[i] = a[i] + b[i] for 0 <= i < aL.
//
// The type __global__ tells the compiler that this function runs on the GPU and
// is launched from host code (see cudaTest below).
//
// Launch layout: a 1D grid of 1D blocks of any size. Each thread starts at its
// flat global index and then advances by the total number of threads in the
// grid, so every element is processed exactly once whether the launch provides
// more threads than elements, fewer, or exactly as many. A single-block launch
// with one thread per element (the classic "tid = threadIdx.x" setup) is simply
// the special case where every thread handles one element.
//
// x, a and b must each point to at least aL ints of device memory.
// No shared memory is used.
__global__ void addKernel(int* x, int* a, int* b, int aL)
{
	if (aL <= 0)
	{
		return;		// Nothing to do; also keeps the unsigned conversion below safe.
	}

	// threadIdx, blockIdx, blockDim and gridDim are built-in CUDA variables that
	// tell each thread where it sits in the launch. The arithmetic is done in
	// size_t so the products cannot overflow for large grids.
	const size_t count = static_cast<size_t>(aL);
	const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

	// The loop condition doubles as the bounds check. CUDA does NOT check array
	// bounds for us, so a thread whose index is past the end must do nothing.
	for (size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; i < count; i += stride)
	{
		// This divides the work between threads: e.g. in a single-block launch
		// the second thread (threadIdx.x == 1) calls
		// vectorAddition(&x[1], &a[1], &b[1]).
		vectorAddition(&x[i], &a[i], &b[i]);
	}
}

// Simplified pairwise interaction: incrementally changes the gravitational
// acceleration of object A based on object B's mass and relative position.
//
//   objectA, objectB - x/y/z hold the position; objectB.w is used as B's mass
//                      (objectA.w is not read).
//   objectAAccel     - running acceleration of A; B's contribution is ADDED to
//                      whatever is already stored there.
//
// The contribution is  d * objectB.w / |d|^3  with  d = objectA - objectB.
// If the two positions coincide (e.g. a body paired with itself) the
// contribution is skipped instead of dividing by zero.
//
// NOTE(review): d points from B towards A, so for a positive mass this
// increment is directed away from B. Confirm that the caller applies the sign
// convention it expects.
__device__ void interactionAB(float4 objectA, float4 objectB, float3* objectAAccel)
{
	float3 distance;

	distance.x = objectA.x - objectB.x;
	distance.y = objectA.y - objectB.y;
	distance.z = objectA.z - objectB.z;

	// Squared separation |d|^2: every component has to be squared (z included).
	float distanceSquared = (distance.x * distance.x) + (distance.y * distance.y) + (distance.z * distance.z);

	// |d|^3 computed as sqrt(|d|^6). sqrtf is the single-precision square root;
	// using the float versions of maths functions avoids slow double-precision
	// arithmetic on the GPU.
	float distanceCubed = sqrtf(distanceSquared * distanceSquared * distanceSquared);

	// A zero denominator would give inf, and 0 * inf = NaN would then poison the
	// whole accumulated acceleration, so such a pair contributes nothing.
	if(distanceCubed == 0.0f)
	{
		return;
	}

	float ans = objectB.w / distanceCubed;

	objectAAccel->x += distance.x * ans;
	objectAAccel->y += distance.y * ans;
	objectAAccel->z += distance.z * ans;
}

// N-body kernel skeleton: for every body, accumulates a net acceleration and
// stores it in b[body].
//
//   a       - per-body float4 input (not read yet; see the placeholder loop).
//   b       - per-body float3 output, at least nBodies entries of device memory.
//   nBodies - number of bodies.
//
// Launch layout: a 1D grid of 1D blocks of any size. Each thread starts at its
// flat global index and advances by the total number of threads in the grid,
// so all nBodies outputs are written for any launch configuration, including a
// single block with one thread per body. No shared memory is used.
//
// As written, the inner loop is an unfinished exercise, so every b[body] is set
// to (0, 0, 0).
__global__ void nBodyKernel(float4* a, float3* b, int nBodies)
{
	if(nBodies <= 0)
	{
		return;		// Nothing to do; also keeps the unsigned conversion below safe.
	}

	// Index arithmetic is done in size_t so the products cannot overflow.
	const size_t bodyCount = static_cast<size_t>(nBodies);
	const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

	// The loop condition doubles as the out-of-bounds guard.
	for(size_t body = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; body < bodyCount; body += stride)
	{
		float3 netAccel;
		netAccel.x = 0.0f;
		netAccel.y = 0.0f;
		netAccel.z = 0.0f;

		for(int i = 0; i < nBodies; i++)
		{
												// What do you think goes in here?
												// NOTE(review): presumably the effect of body i on
												// this thread's body should be accumulated into
												// netAccel here - deliberately left unimplemented.
		}
		b[body] = netAccel;
	}
}

// cudaTest is an external function called from main.cpp and declared above.
//
// Adds the host arrays element by element on the GPU and overwrites `a` with
// the result: a[i] = a[i] + b[i] for 0 <= i < aL.
//
//   a  - host array of aL ints; input, and overwritten with the sums on success.
//   b  - host array of aL ints; input only.
//   aL - number of elements. Values <= 0 mean there is nothing to do.
//
// The function returns void, so failures cannot be reported to the caller.
// Instead, the first CUDA error is printed to stderr and `a` is left untouched.
void cudaTest(int* a, int* b, int aL)
{
	if(aL <= 0)
	{
		return;								// Empty input: a zero-thread launch would be invalid.
	}
	if(a == NULL || b == NULL)
	{
		fprintf(stderr, "cudaTest: null input array\n");
		return;
	}

	const size_t bytes = static_cast<size_t>(aL) * sizeof(int);
	const int threadsPerLaunch = 256;		// A block has a hardware limit on its thread count, so
											// large arrays are processed in chunks of this size.

	int* cuda_a = NULL;						// First, we declare the pointers we're going to use in our CUDA
	int* cuda_b = NULL;						// kernel - remember that variables have to be copied from host
	int* cuda_x = NULL;						// memory (RAM) to device memory (VRAM) before the GPU can use them.

	// cudaMalloc allocates device memory for our pointers. If you're passing
	// something other than an int, e.g. a short, a struct or a class, you'll
	// need to use that type's sizeof(). Every CUDA call returns a status code;
	// once one step fails, all the following steps are skipped.
	cudaError_t status = cudaMalloc((void**)&cuda_a, bytes);
	if(status == cudaSuccess) status = cudaMalloc((void**)&cuda_b, bytes);
	if(status == cudaSuccess) status = cudaMalloc((void**)&cuda_x, bytes);

	// Copy the contents of a and b into their device-side counterparts.
	if(status == cudaSuccess) status = cudaMemcpy(cuda_a, a, bytes, cudaMemcpyHostToDevice);
	if(status == cudaSuccess) status = cudaMemcpy(cuda_b, b, bytes, cudaMemcpyHostToDevice);

	// The kernel syntax can be confusing: <<<1, n>>> launches one block with n
	// threads in it (Intellisense doesn't like <<<>>> but it WILL compile).
	// Each launch handles one chunk of the arrays; the pointers are offset so
	// that the chunk's first element is at index 0 inside the kernel.
	int done = 0;
	while(status == cudaSuccess && done < aL)
	{
		const int remaining = aL - done;
		const int chunk = (remaining < threadsPerLaunch) ? remaining : threadsPerLaunch;

		addKernel<<<1, chunk>>>(cuda_x + done, cuda_a + done, cuda_b + done, chunk);

		status = cudaGetLastError();		// A launch doesn't return a status itself; bad launch
											// configurations are reported here.
		done += chunk;
	}

	// Wait until all launched kernels have finished; this also reports errors
	// that happened while the kernels were running.
	if(status == cudaSuccess) status = cudaDeviceSynchronize();

	// Here, we overwrite a with the contents of cuda_x - only if everything
	// above succeeded, so a never receives uninitialised device memory.
	if(status == cudaSuccess) status = cudaMemcpy(a, cuda_x, bytes, cudaMemcpyDeviceToHost);

	if(status != cudaSuccess)
	{
		fprintf(stderr, "cudaTest: CUDA error: %s\n", cudaGetErrorString(status));
	}

	// Just like calling delete after new, we need to call cudaFree after
	// cudaMalloc to release the memory our arrays have been occupying
	// (freeing a NULL pointer is harmless). If you intend on calling your CUDA
	// functions regularly throughout your code, it's often a good idea to call
	// cudaMalloc when your program starts, cudaFree when your program ends, and
	// store the pointers in an array you pass back and forth between your C++
	// and CUDA code.
	cudaFree(cuda_a);
	cudaFree(cuda_b);
	cudaFree(cuda_x);
}